load("p.values.interactions.rda")
load("expression.rda")
load("clinical.rda")
load("pValueNormalnosciGrupShapiro.rda")
load("pValuesNormalnoscLillyforse.rda")
library("ggplot2")
library("grid")
library("gridExtra")
#tabela z wybranymi juĹĽ podczas analizy genami
cancerData_k2 <- na.omit(data.frame (t(expression["TMED4",]), t(expression["SART1",]), t(expression["C6orf145",]),t(expression["PSTK",]), t(expression["HSPE1",]), t(expression["CPNE5",]),as.factor(t(clinical[1,])), as.factor(t(clinical[2,])), as.factor(t(clinical[3,]))))
colnames(cancerData_k2) <- c("TMED4","SART1", "C6orf145", "PSTK", "HSPE1", "CPNE5", "cancer", "wiek", "plec")
In conslusion from Phase 1, we said that after performing ANOVA for all genes qwe obtain that means for all 16115 genes are different among groups of cancers while all residuals are not normally distributed with significance level 5%).
Yet we wish to choose ,,small?? list of genes that will help to identify the type of cancer / cohort in the best way.
In first step, we select genes, which comply with base ANOVAs? assumtions. Our analysis is based on Shapiro-Wilk and Lillforse tests for groups normality and Fligner-Killeen as well as Bartlett?s Test of Homogeneity of Variances.
Firstly, we calculate p-values of Shapiro/Lilliforse test for normality among cancer cohorts.
library(nortest)
ramkaLilly <-as.data.frame(matrix(NA, nrow = 16115, ncol = 13))
start <- 1
end <- 16115
for(N in start:end)
{
cancerData_k <- na.omit(data.frame ( t(expression[N,]),as.factor(t(clinical[1,])),as.factor(t(clinical[2,])),as.factor(t(clinical[3,]))))
colnames(cancerData_k) <- c("gene", "cancer", "wiek", "plec")
groups <- list()
resultscancerShapiro <- list()
resultscancerLilly <- list()
ile<-list()
for(i in 1:nlevels(cancerData_k$cancer))
{
groups[[i]] <- cancerData_k$gene[cancerData_k$cancer==levels(cancerData_k$cancer)[i]]
ile[i] <- length(groups[[i]])
if (ile[i]>4){ resultscancerLilly[[i]] <- lillie.test(groups[[i]])
} else { resultscancerLilly[[i]] <- NA }
if (ile[i]>2){ resultscancerShapiro[[i]] <- shapiro.test(groups[[i]])
} else { resultscancerShapiro[[i]] <- NA }
}
ramkaLilly[N,] <- data.frame(t(sapply(resultscancerLilly, function(x) ifelse(length(x)>1, x$p.value, NA))))
ramkaShapiro[N,] <- data.frame(t(sapply(resultscancerShapiro, function(x) ifelse(length(x)>1, x$p.value, NA))))
}
save(ramkaLilly, file="pVauesNormalnoscLilly.rda")
save(ramkaShapiro, file="pValueNormalnosciGrupShapiro.rda")
Now, we check variances, but only for genes that has some decent number of cancer cohorts that we may expect are normal basing on presious simulations.
This function for a given gene calculates p-value of null hypothesis of variance homogenity
pValueVarianceEquality <- function(gen)
{
df <- data.frame(t(gen), t(clinical[1,])[,1])
colnames(df) <- c("gene", "cancer")
df2 <- df[!is.na(df$gene),]
grupy <- list()
for(i in 1:nlevels(df$cancer))
{
grupy[[i]] <- df2$gene[df2$cancer==levels(df2$cancer)[i]]
}
ktoreOk <- lapply(grupy, length)>2
# bartlett.test(grupy[ktoreOk])$p.value
fligner.test(grupy[ktoreOk])$p.value
}
That function for a given matrix with p-values of null hypo for normality among cancer cohorts finds genes that has at least poziomNormalnosci of normal (as per Shapiro/Lilli tests discussed before) groups and for those genes finds p-value of Bartlett and Fligner tests for variance homogenity
dajListeGenowZpWartoscia <- function(Model, poziomNormalnosci)
{
tempModel <- apply(Model[,-2]>0.05, 1, sum)
genyZwymaganymPoziomemNormalnosci <- tempModel[!is.na(tempModel)]>poziomNormalnosci
nazwyGenowZMin10NormGrupamiModel <- rownames(expression[which(genyZwymaganymPoziomemNormalnosci==TRUE),])
ramkaNormalniejszychGenowDlaTegoModelu <- expression[genyZwymaganymPoziomemNormalnosci, ]
wariancjeWybrancowModelu <- matrix(NA, nrow = sum(genyZwymaganymPoziomemNormalnosci))
for(N in 1:sum(genyZwymaganymPoziomemNormalnosci))
{
wariancjeWybrancowModelu[N] <- pValueVarianceEquality(ramkaNormalniejszychGenowDlaTegoModelu[N,])
}
rownames(wariancjeWybrancowModelu) <- nazwyGenowZMin10NormGrupamiModel
wariancjeWybrancowModelu
}
And here we have the first gene that we are interested in:
modelShapiro <- pValuesGroupsNormalityShapiro
shapiroWariancje <- dajListeGenowZpWartoscia(modelShapiro, 11)
modelLilly <- pValuesNormalnoscLillyforse
lillyWariancje <- dajListeGenowZpWartoscia(modelLilly, 11)
which(lillyWariancje[,1]>0.01)
## PSTK
## 30
Now, we can see that, gene called “PSTK” give the best result for ANOVAs’ assumtions.
ggplot(cancerData_k2, aes(cancer,PSTK)) + geom_boxplot() + coord_flip()
We can see that some of cohort created small group. For example Rectal Cancer has very similiar value of expansion PSTK like Loung Squamous or Acute Myeloid. So in the next step we should find other gene which help us distinguish this cohort. But before, let’s discuss ANOVA results for PSTK.
model1 = aov(PSTK~cancer, data=cancerData_k2)
summary(model1)
## Df Sum Sq Mean Sq F value Pr(>F)
## cancer 11 840.2 76.38 154.1 <2e-16 ***
## Residuals 3387 1679.0 0.50
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Creating graphs:
#install.packages("grid")
library("grid")
#install.packages("gridExtra")
library("gridExtra")
require(ggplot2)
diagPlot<-function(model){
p1<-ggplot(model, aes(.fitted, .resid))+geom_point()
p1<-p1+stat_smooth(method="loess")+geom_hline(yintercept=0, col="red", linetype="dashed")
p1<-p1+xlab("Fitted values")+ylab("Residuals")
p1<-p1+ggtitle("Residual vs Fitted")+theme_bw()
p2<-ggplot(model, aes(qqnorm(.stdresid)[[1]], .stdresid))+geom_point(na.rm = TRUE)
p2<-p2+geom_abline(aes(qqline(.stdresid)))+xlab("Theoretical Quantiles")+ylab("Standardized Residuals")
p2<-p2+ggtitle("Normal Q-Q")+theme_bw()
p3<-ggplot(model, aes(.fitted, sqrt(abs(.stdresid))))+geom_point(na.rm=TRUE)
p3<-p3+stat_smooth(method="loess", na.rm = TRUE)+xlab("Fitted Value")
p3<-p3+ylab(expression(sqrt("|Standardized residuals|")))
p3<-p3+ggtitle("Scale-Location")+theme_bw()
p4<-ggplot(model, aes(seq_along(.cooksd), .cooksd))+geom_bar(stat="identity", position="identity")
p4<-p4+xlab("Obs. Number")+ylab("Cook's distance")
p4<-p4+ggtitle("Cook's distance")+theme_bw()
p5<-ggplot(model, aes(.hat, .stdresid))+geom_point(aes(size=.cooksd), na.rm=TRUE)
p5<-p5+stat_smooth(method="loess", na.rm=TRUE)
p5<-p5+xlab("Leverage")+ylab("Standardized Residuals")
p5<-p5+ggtitle("Residual vs Leverage Plot")
p5<-p5+scale_size_continuous("Cook's Distance", range=c(1,5))
p5<-p5+theme_bw()+theme(legend.position="bottom")
p6<-ggplot(model, aes(.hat, .cooksd))+geom_point(na.rm=TRUE)+stat_smooth(method="loess", na.rm=TRUE)
p6<-p6+xlab("Leverage hii")+ylab("Cook's Distance")
p6<-p6+ggtitle("Cook's dist vs Leverage hii/(1-hii)")
p6<-p6+geom_abline(slope=seq(0,3,0.5), color="gray", linetype="dashed")
p6<-p6+theme_bw()
return(list(rvfPlot=p1, qqPlot=p2, sclLocPlot=p3, cdPlot=p4, rvlevPlot=p5, cvlPlot=p6))
}
g<-diagPlot(model1)
do.call(grid.arrange, c(g, list(ncol=3)))
In the chart “Residuals vs Fitted” we see that the residuals of the model are characterized by the same mean independent of the theoretical value of dependent variable. Based on Theoretical Quantiles chart for normal distribution we see that the rest of the model is characterized by a normal distribution. In turn, based on the chart “Scale Location” we find that the variance of the residuals of the model is homogeneous. The graph “Cook’s Distance” show that the cook measure is less than 0.005, indicating no abnormal observation.
qplot(cancer, PSTK, fill=factor(plec), data=cancerData_k2, geom="boxplot", position="dodge") + theme_bw() + coord_flip()
We can see difference between men and women.
coplot(PSTK ~ cancer | plec, data=cancerData_k2, panel=panel.smooth)
The same results give ANOVA model.
aov.out = aov(PSTK ~ plec * cancer, data=cancerData_k2)
kruskal.test(PSTK ~ plec, data=cancerData_k2)
##
## Kruskal-Wallis rank sum test
##
## data: PSTK by plec
## Kruskal-Wallis chi-squared = 185.47, df = 1, p-value < 2.2e-16
options(show.signif.stars=T)
summary(aov.out)
## Df Sum Sq Mean Sq F value Pr(>F)
## plec 1 133.6 133.62 270.122 <2e-16 ***
## cancer 11 706.6 64.24 129.863 <2e-16 ***
## plec:cancer 9 8.4 0.93 1.889 0.049 *
## Residuals 3377 1670.5 0.49
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#TukeyHSD(aov.out)
#TukeyHSD(aov.out, which=c("cancer"), conf.level=.99)
plot(TukeyHSD(aov.out))
#with(cancerData_k2, pairwise.t.test(PSTK, cancer, p.adjust.method="bonferroni"))
Age doesn’t have a significant infuence:
aov.out.w = aov(PSTK ~ wiek * cancer, data=cancerData_k2)
options(show.signif.stars=T)
kruskal.test(PSTK ~ wiek, data=cancerData_k2)
##
## Kruskal-Wallis rank sum test
##
## data: PSTK by wiek
## Kruskal-Wallis chi-squared = 53.772, df = 70, p-value = 0.9247
summary(aov.out.w)
## Df Sum Sq Mean Sq F value Pr(>F)
## wiek 70 41.4 0.59 1.190 0.136
## cancer 11 832.4 75.67 152.138 <2e-16 ***
## wiek:cancer 488 238.4 0.49 0.982 0.596
## Residuals 2829 1407.0 0.50
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(TukeyHSD(aov.out.w))
#with(cancerData_k2, pairwise.t.test(PSTK, cancer, p.adjust.method="bonferroni"))
Age and gender:
aov.out.wa = aov(PSTK ~ wiek * cancer*plec, data=cancerData_k2)
options(show.signif.stars=T)
summary(aov.out.wa)
Other type of gene which give good results for Shapiro Test is C6orf145. Unfortunately it do not comply with Test of Homogeneity of Variances. Let’s check correlation between this gene.
policzKowariancjeGenow <- function(gen1 , gen2)
{
genA <- t(expression[which(rownames(expression)==gen1),])
genB <- t(expression[which(rownames(expression)==gen2),])
ktoreLiczyc <- !is.na(genA) & !is.na(genB)
cor(genA[ktoreLiczyc,], genB[ktoreLiczyc,])
}
policzKowariancjeGenow("PSTK", "C6orf145")
## [1] -0.1592402
It is ok.
ggplot(cancerData_k2, aes(cancer,C6orf145)) + geom_boxplot()+coord_flip()
model2 = aov(C6orf145~cancer, data=cancerData_k2)
kruskal.test(C6orf145~cancer, data = cancerData_k2)
##
## Kruskal-Wallis rank sum test
##
## data: C6orf145 by cancer
## Kruskal-Wallis chi-squared = 1707.6, df = 11, p-value < 2.2e-16
g<-diagPlot(model2)
do.call(grid.arrange, c(g, list(ncol=3)))
Charts “Residuals vs Fitted” and “Normal Q-Q” are very similar to pervious gene. What is interesting, based on the chart “Scale Location” we find that the variance of the residuals of the model is NOT homogeneous. It is cause of negative result of finger test. In this case, we also have not abnormal observation.
aov.out.wc <- aov(C6orf145 ~ wiek, data=cancerData_k2)
options(show.signif.stars=T)
summary(aov.out.wc)
## Df Sum Sq Mean Sq F value Pr(>F)
## wiek 70 346 4.947 2.149 1.34e-07 ***
## Residuals 3328 7660 2.302
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
kruskal.test(C6orf145 ~ wiek, data=cancerData_k2)
##
## Kruskal-Wallis rank sum test
##
## data: C6orf145 by wiek
## Kruskal-Wallis chi-squared = 102.34, df = 70, p-value = 0.007077
Expresion of C6orf145 is connected also witch age, but not gender.
qplot(cancer, C6orf145, fill=factor(plec), data=cancerData_k2, geom="boxplot", position="dodge")+theme_bw()+coord_flip()
aov.out.wac = aov(C6orf145 ~ wiek * cancer*plec, data=cancerData_k2)
options(show.signif.stars=T)
summary(aov.out.wac)
## Df Sum Sq Mean Sq F value Pr(>F)
## wiek 70 346 4.9 5.793 <2e-16 ***
## cancer 11 4823 438.5 513.449 <2e-16 ***
## plec 1 3 2.5 2.936 0.0868 .
## wiek:cancer 488 437 0.9 1.049 0.2424
## wiek:plec 57 35 0.6 0.711 0.9500
## cancer:plec 9 15 1.6 1.920 0.0450 *
## wiek:cancer:plec 186 148 0.8 0.933 0.7274
## Residuals 2576 2200 0.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
HSPE1 and CPNE5 also give good result, if we look on normality distribution in each kind of cancer.
ggplot(cancerData_k2, aes(cancer,HSPE1)) + geom_boxplot()+coord_flip()
model3 = aov(HSPE1~cancer, data=cancerData_k2)
g<-diagPlot(model3)
do.call(grid.arrange, c(g, list(ncol=3)))
aov.outh = aov(HSPE1 ~ cancer * plec, data=cancerData_k2)
summary(aov.outh)
## Df Sum Sq Mean Sq F value Pr(>F)
## cancer 11 655.3 59.58 143.978 <2e-16 ***
## plec 1 1.7 1.73 4.169 0.0412 *
## cancer:plec 9 7.4 0.82 1.990 0.0366 *
## Residuals 3377 1397.3 0.41
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
options(show.signif.stars=F)
summary(aov.outh)
## Df Sum Sq Mean Sq F value Pr(>F)
## cancer 11 655.3 59.58 143.978 <2e-16
## plec 1 1.7 1.73 4.169 0.0412
## cancer:plec 9 7.4 0.82 1.990 0.0366
## Residuals 3377 1397.3 0.41
ggplot(cancerData_k2, aes(cancer,CPNE5)) + geom_boxplot()+coord_flip()
model3 = aov(HSPE1~cancer, data=cancerData_k2)
g<-diagPlot(model3)
do.call(grid.arrange, c(g, list(ncol=3)))
#as.data.frame(p.values.interactions.rda)
We prepared also data frame with pvalue from dwo-way ANOVA models for interaction gender*cances.
ggplot(cancerData_k2, aes(cancer,SART1)) + geom_boxplot()+coord_flip()
qplot(cancer, SART1, fill=factor(plec), data=cancerData_k2, geom="boxplot", position="dodge")+theme_bw()+coord_flip()
model3 = aov(SART1~cancer, data=cancerData_k2)
g<-diagPlot(model3)
do.call(grid.arrange, c(g, list(ncol=3)))
#as.data.frame(p.values.interactions.rda)
qplot(cancer, TMED4, fill=factor(plec), data=cancerData_k2, geom="boxplot", position="dodge")+theme_bw()+coord_flip()
PSTK show that: -TCGA Rectal are similar to Gliobastoma Cancer -One group created ovarian, Head and Neck, Colon and Acute Myeloid Leukemia Cancer
C6orf145: -find different between TCGA Rectal and Gliobastoma Cancer -extract Colon and Acute Myeloid Leukemia from previous group -show different expresion of genes in ages’ group
HSPE1: -find different between Ovarian and Head and Neck Cancer
CPNE5: -extract Gliobastoma Cancer
SART1: -have similar expresion for each age